.
# install packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(gganimate)
library(magick)
## Linking to ImageMagick 6.9.9.39
## Enabled features: cairo, fontconfig, freetype, lcms, pango, rsvg, webp
## Disabled features: fftw, ghostscript, x11
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(corrplot)
## corrplot 0.84 loaded
library(beepr)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(praise)
library(transformr)
library(sf)
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
##
## Attaching package: 'sf'
## The following object is masked from 'package:transformr':
##
## st_normalize
library(here)
## here() starts at /Users/allisonbailey/Desktop/Fall 2019 Courses/ESM 206-Data/Lab Assignments/Lab 10/esm206-f2019-lab-10
homes <- read_csv("slo_homes.csv")
## Parsed with column specification:
## cols(
## City = col_character(),
## Price = col_double(),
## Bedrooms = col_double(),
## Bathrooms = col_double(),
## SqFt = col_double(),
## PricePerSqFt = col_double(),
## Status = col_character()
## )
homes_clean <- homes %>%
clean_names() %>%
filter(city %in% c("San Luis Obispo", "Atascadero", "Arroyo Grande"))
Look at correlations between numeric variables (checking to see if we think that collinearity might be a concern)
homes_cor <- cor(homes_clean[2:6])
homes_cor
## price bedrooms bathrooms sq_ft price_per_sq_ft
## price 1.0000000 0.31152578 0.4991453 0.6618575 0.73664029
## bedrooms 0.3115258 1.00000000 0.7519186 0.6960233 -0.09994633
## bathrooms 0.4991453 0.75191865 1.0000000 0.8046112 0.07624770
## sq_ft 0.6618575 0.69602326 0.8046112 1.0000000 0.12549708
## price_per_sq_ft 0.7366403 -0.09994633 0.0762477 0.1254971 1.00000000
Make a correlogram (plot of correlations)
corrplot(homes_cor)
beep(20)
Let’s start with a complete model:
home_lm <- lm(price ~ city + bedrooms + bathrooms + sq_ft + status, data = homes_clean)
home_lm
##
## Call:
## lm(formula = price ~ city + bedrooms + bathrooms + sq_ft + status,
## data = homes_clean)
##
## Coefficients:
## (Intercept) cityAtascadero citySan Luis Obispo
## 184130.9 -167396.4 31018.1
## bedrooms bathrooms sq_ft
## -161645.5 48692.0 389.1
## statusRegular statusShort Sale
## 303964.6 -19828.6
# P = 184130.9 - 167396(Atascadero) + 31018(SLO) - 161645(bedrooms)
If everything else about the home is the same, then we would expect a home in Atascadero to sell for $167,396 dollars LESS than a home in Arroyo Grande, on average.
home_lm2 <- lm(price ~ city + sq_ft + status, data = homes_clean)
home_lm2
##
## Call:
## lm(formula = price ~ city + sq_ft + status, data = homes_clean)
##
## Coefficients:
## (Intercept) cityAtascadero citySan Luis Obispo
## -105821 -182587 70279
## sq_ft statusRegular statusShort Sale
## 325 315441 31284
AIC values:
AIC(home_lm)
## [1] 3576.834
AIC(home_lm2)
## [1] 3584.3
summary(home_lm)
##
## Call:
## lm(formula = price ~ city + bedrooms + bathrooms + sq_ft + status,
## data = homes_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -843938 -115963 -12298 109718 3445077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 184130.93 143368.84 1.284 0.20157
## cityAtascadero -167396.39 78701.95 -2.127 0.03552 *
## citySan Luis Obispo 31018.14 114895.10 0.270 0.78766
## bedrooms -161645.51 49414.32 -3.271 0.00141 **
## bathrooms 48692.04 52408.63 0.929 0.35476
## sq_ft 389.12 53.17 7.318 3.43e-11 ***
## statusRegular 303964.64 105942.81 2.869 0.00488 **
## statusShort Sale -19828.56 86335.35 -0.230 0.81875
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 380600 on 117 degrees of freedom
## Multiple R-squared: 0.5682, Adjusted R-squared: 0.5424
## F-statistic: 22 on 7 and 117 DF, p-value: < 2.2e-16
summary(home_lm2)
##
## Call:
## lm(formula = price ~ city + sq_ft + status, data = homes_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -653118 -120914 -8844 101402 3644738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105820.71 118697.40 -0.892 0.3745
## cityAtascadero -182586.90 80683.44 -2.263 0.0254 *
## citySan Luis Obispo 70278.97 115846.31 0.607 0.5452
## sq_ft 325.03 31.54 10.306 <2e-16 ***
## statusRegular 315441.26 109749.65 2.874 0.0048 **
## statusShort Sale 31284.44 87356.11 0.358 0.7209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 395100 on 119 degrees of freedom
## Multiple R-squared: 0.5268, Adjusted R-squared: 0.5069
## F-statistic: 26.49 on 5 and 119 DF, p-value: < 2.2e-16
Now I’m going to create a nice regression table with ‘stargazer’ :
stargazer(home_lm, home_lm2, type = "html")
| Dependent variable: | ||
| price | ||
| (1) | (2) | |
| cityAtascadero | -167,396.400** | -182,586.900** |
| (78,701.950) | (80,683.440) | |
| citySan Luis Obispo | 31,018.140 | 70,278.970 |
| (114,895.100) | (115,846.300) | |
| bedrooms | -161,645.500*** | |
| (49,414.320) | ||
| bathrooms | 48,692.040 | |
| (52,408.630) | ||
| sq_ft | 389.120*** | 325.028*** |
| (53.171) | (31.538) | |
| statusRegular | 303,964.600*** | 315,441.300*** |
| (105,942.800) | (109,749.700) | |
| statusShort Sale | -19,828.560 | 31,284.440 |
| (86,335.350) | (87,356.110) | |
| Constant | 184,130.900 | -105,820.700 |
| (143,368.800) | (118,697.400) | |
| Observations | 125 | 125 |
| R2 | 0.568 | 0.527 |
| Adjusted R2 | 0.542 | 0.507 |
| Residual Std. Error | 380,586.300 (df = 117) | 395,084.400 (df = 119) |
| F Statistic | 21.997*** (df = 7; 117) | 26.491*** (df = 5; 119) |
| Note: | p<0.1; p<0.05; p<0.01 | |
Now let’s check out the diagnostic plots to see if our assumptions (normality of residulas and homoscedasticity) are satisfied or if we’re really concerned.
plot(home_lm2)
Make some home price predictions with new data
We’re going to be using that simplified model (home_lm2)
new_df <- data.frame(city = rep(c("San Luis Obispo", "Arroyo Grande", "Atascadero"), each = 10),
sq_ft = rep(seq(0, 5000, length = 10)),
status = "Regular")
new_df
## city sq_ft status
## 1 San Luis Obispo 0.0000 Regular
## 2 San Luis Obispo 555.5556 Regular
## 3 San Luis Obispo 1111.1111 Regular
## 4 San Luis Obispo 1666.6667 Regular
## 5 San Luis Obispo 2222.2222 Regular
## 6 San Luis Obispo 2777.7778 Regular
## 7 San Luis Obispo 3333.3333 Regular
## 8 San Luis Obispo 3888.8889 Regular
## 9 San Luis Obispo 4444.4444 Regular
## 10 San Luis Obispo 5000.0000 Regular
## 11 Arroyo Grande 0.0000 Regular
## 12 Arroyo Grande 555.5556 Regular
## 13 Arroyo Grande 1111.1111 Regular
## 14 Arroyo Grande 1666.6667 Regular
## 15 Arroyo Grande 2222.2222 Regular
## 16 Arroyo Grande 2777.7778 Regular
## 17 Arroyo Grande 3333.3333 Regular
## 18 Arroyo Grande 3888.8889 Regular
## 19 Arroyo Grande 4444.4444 Regular
## 20 Arroyo Grande 5000.0000 Regular
## 21 Atascadero 0.0000 Regular
## 22 Atascadero 555.5556 Regular
## 23 Atascadero 1111.1111 Regular
## 24 Atascadero 1666.6667 Regular
## 25 Atascadero 2222.2222 Regular
## 26 Atascadero 2777.7778 Regular
## 27 Atascadero 3333.3333 Regular
## 28 Atascadero 3888.8889 Regular
## 29 Atascadero 4444.4444 Regular
## 30 Atascadero 5000.0000 Regular
Use the ‘predict()’ to make new predicitons using home_lm2:
predict_df <- predict(home_lm2, newdata = new_df)
predict_df
## 1 2 3 4 5 6
## 279899.52 460470.61 641041.69 821612.78 1002183.86 1182754.94
## 7 8 9 10 11 12
## 1363326.03 1543897.11 1724468.20 1905039.28 209620.55 390191.63
## 13 14 15 16 17 18
## 570762.72 751333.80 931904.89 1112475.97 1293047.06 1473618.14
## 19 20 21 22 23 24
## 1654189.23 1834760.31 27033.65 207604.73 388175.82 568746.90
## 25 26 27 28 29 30
## 749317.99 929889.07 1110460.16 1291031.24 1471602.33 1652173.41
# Bind that together with the new_df:
full_df <- data.frame(new_df, predict_df)
full_df
## city sq_ft status predict_df
## 1 San Luis Obispo 0.0000 Regular 279899.52
## 2 San Luis Obispo 555.5556 Regular 460470.61
## 3 San Luis Obispo 1111.1111 Regular 641041.69
## 4 San Luis Obispo 1666.6667 Regular 821612.78
## 5 San Luis Obispo 2222.2222 Regular 1002183.86
## 6 San Luis Obispo 2777.7778 Regular 1182754.94
## 7 San Luis Obispo 3333.3333 Regular 1363326.03
## 8 San Luis Obispo 3888.8889 Regular 1543897.11
## 9 San Luis Obispo 4444.4444 Regular 1724468.20
## 10 San Luis Obispo 5000.0000 Regular 1905039.28
## 11 Arroyo Grande 0.0000 Regular 209620.55
## 12 Arroyo Grande 555.5556 Regular 390191.63
## 13 Arroyo Grande 1111.1111 Regular 570762.72
## 14 Arroyo Grande 1666.6667 Regular 751333.80
## 15 Arroyo Grande 2222.2222 Regular 931904.89
## 16 Arroyo Grande 2777.7778 Regular 1112475.97
## 17 Arroyo Grande 3333.3333 Regular 1293047.06
## 18 Arroyo Grande 3888.8889 Regular 1473618.14
## 19 Arroyo Grande 4444.4444 Regular 1654189.23
## 20 Arroyo Grande 5000.0000 Regular 1834760.31
## 21 Atascadero 0.0000 Regular 27033.65
## 22 Atascadero 555.5556 Regular 207604.73
## 23 Atascadero 1111.1111 Regular 388175.82
## 24 Atascadero 1666.6667 Regular 568746.90
## 25 Atascadero 2222.2222 Regular 749317.99
## 26 Atascadero 2777.7778 Regular 929889.07
## 27 Atascadero 3333.3333 Regular 1110460.16
## 28 Atascadero 3888.8889 Regular 1291031.24
## 29 Atascadero 4444.4444 Regular 1471602.33
## 30 Atascadero 5000.0000 Regular 1652173.41
Make a graph with the actuall data, and what our model actually predicts:
ggplot() +
geom_point(data = homes_clean,
aes(x = sq_ft, y = price, color = city, pch = city)) +
geom_line(data = full_df,
aes(x = sq_ft, y = predict_df, color = city)) +
theme_light()
ggplot(data = homes_clean, aes(x= sq_ft, y = price)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~city)
‘sf’ = simple features
Sticky geometries! Which means we can just work with attributes like a normal data frame for wrangling, viz, etc.
Get CA dams data:
dams <- read_csv("ca_dams.csv") %>%
clean_names() %>%
drop_na(latitude) %>%
drop_na(longitude) %>%
drop_na(year_completed)
## Parsed with column specification:
## cols(
## .default = col_character(),
## RECORDID = col_double(),
## STATEID = col_logical(),
## LONGITUDE = col_double(),
## LATITUDE = col_double(),
## DISTANCE = col_double(),
## YEAR_COMPLETED = col_double(),
## DAM_LENGTH = col_double(),
## DAM_HEIGHT = col_double(),
## STRUCTURAL_HEIGHT = col_double(),
## HYDRAULIC_HEIGHT = col_double(),
## NID_HEIGHT = col_double(),
## MAX_DISCHARGE = col_double(),
## MAX_STORAGE = col_double(),
## NORMAL_STORAGE = col_double(),
## NID_STORAGE = col_double(),
## SURFACE_AREA = col_double(),
## DRAINAGE_AREA = col_double(),
## INSPECTION_FREQUENCY = col_double(),
## SPILLWAY_WIDTH = col_double(),
## VOLUME = col_double()
## # ... with 6 more columns
## )
## See spec(...) for full column specifications.
## Warning: 109 parsing failures.
## row col expected actual file
## 1337 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## 1338 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## 1339 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## 1340 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## 1341 FED_OTHER 1/0/T/F/TRUE/FALSE CE 'ca_dams.csv'
## .... ......... .................. ...... .............
## See problems(...) for more details.
Convert lat/lon numbers to simple features data (sf)
dams_sf <- st_as_sf(dams, coords = c("longitude", "latitude"))
st_crs(dams_sf) <- 4326
plot(dams_sf)
## Warning: plotting the first 9 out of 67 attributes; use max.plot = 67 to
## plot all
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
Now, lets read n the shapefile data for the state of CA (outer boundary TIGER shapefile data)
ca_border <- read_sf(here::here("ca_state_border"), layer = "CA_State_TIGER2016")
plot(ca_border)
## Warning: plotting the first 9 out of 14 attributes; use max.plot = 14 to
## plot all
Now, plot dams n CA over the top of the CA outline:
ggplot() +
geom_sf(data = ca_border, fill = "red") +
geom_sf(data = dams_sf, aes(size = dam_height, color = county), alpha = 0.5, show.legend = FALSE) +
theme_minimal()
Making an animated plot:
ggplot() +
geom_sf(data = ca_border) +
geom_sf(data = dams_sf, size = 1.5, color = "gray50") +
theme_void() +
labs(title = 'Year: {round(frame_time, 0)}') +
transition_time(year_completed) +
shadow_mark(alpha = 1)